1. Data Set Preparation

	1.1 Create the folder /root/TrainingOnHDP/dataset/spark in your sandbox

	1.2 Upload all data files (including subfolders) into /root/TrainingOnHDP/dataset/spark in your sandbox

	1.3 Log in to the sandbox and run the following:

		hadoop fs -mkdir /user/root/works
		hadoop fs -chmod -R 777 /user/root/works
		hadoop fs -mkdir /root
		hadoop fs -mkdir /root/labs
		hadoop fs -mkdir /root/labs/datasets
		hadoop fs -put /root/TrainingOnHDP/dataset/spark/* /root/labs/datasets
		hadoop fs -chmod -R 777 /root/labs/datasets
		
2. Creating DataFrames
	2.1 Example:
		import sqlContext.implicits._
		val existingRDD = sc.textFile("file:/root/TrainingOnHDP/dataset/spark/people.txt")
		val df = existingRDD.toDF()
		df.collect().foreach(println)
	
	2.2 Example:
		val df1 = sqlContext.sql("select * from sample_07")
		df1.collect().foreach(println)

	2.3 Example:
		val df2 = sqlContext.read.json("/root/labs/datasets/labs/people.json")
		df2.printSchema
		df2.collect().foreach(println)
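
		// A DataFrame can also be built from a local collection (a minimal sketch)
		val dfFromSeq = Seq(("Alice", 30), ("Bob", 25)).toDF("name", "age")
		dfFromSeq.show()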

		
3. DataFrame Operations
	3.1 Example:
		val df3 = sqlContext.read.json("/root/labs/datasets/labs/people.json")
		df3.show()
		df3.printSchema()
		df3.select("name").show()
		df3.select(df3("name"), df3("age") + 1).show()
		df3.filter(df3("age") > 21).show()
		df3.groupBy("age").count().show()
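		// Sorting follows the same pattern (additional sketch)
		df3.orderBy(df3("age").desc).show()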

4. Dataset Operations
	4.1 Example:		
		val ds = Seq(1, 2, 3).toDS()
		ds.map(_ + 1).collect()
		case class Person(name: String, age: Long)
		val ds = Seq(Person("Andy", 32)).toDS()
		val path = "/root/labs/datasets/labs/people.json"
		val people = sqlContext.read.json(path).as[Person]
		people.show()
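		// Typed transformations keep the Person type (a minimal sketch)
		ds.filter(_.age > 30).map(_.name).collect()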

5. Inferring the Schema Using Reflection
	5.1 Example:
		import sqlContext.implicits._
		case class Person(name: String, age: Int)
		val people = sc.textFile("/root/labs/datasets/labs/people.txt").map(_.split(",")).map(p => Person(p(0), p(1).trim.toInt)).toDF()
		people.registerTempTable("people")
		val teenagers = sqlContext.sql("SELECT name, age FROM people WHERE age >= 13 AND age <= 19")
		teenagers.map(t => "Name: " + t(0)).collect().foreach(println)
		teenagers.map(t => "Name: " + t.getAs[String]("name")).collect().foreach(println)
		teenagers.map(_.getValuesMap[Any](List("name", "age"))).collect().foreach(println)

6. Programmatically Specifying the Schema
	6.1 Example:
		val people = sc.textFile("/root/labs/datasets/labs/people.txt")
		val schemaString = "name age"
		import org.apache.spark.sql.Row
		import org.apache.spark.sql.types.{StructType, StructField, StringType}
		val schema = StructType(schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true)))
		val rowRDD = people.map(_.split(",")).map(p => Row(p(0), p(1).trim))
		val peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema)
		peopleDataFrame.registerTempTable("people")
		val results = sqlContext.sql("SELECT name FROM people")
		results.map(t => "Name: " + t(0)).collect().foreach(println)

7. Hive Integration
	7.1 Example
		sqlContext.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
		sqlContext.sql("LOAD DATA LOCAL INPATH '/root/TrainingOnHDP/dataset/spark/kv1.txt' INTO TABLE src")
		sqlContext.sql("FROM src SELECT key, value").collect().foreach(println)
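		// Any HiveQL query can be issued the same way, e.g. a row count (sketch)
		sqlContext.sql("SELECT COUNT(*) FROM src").collect().foreach(println)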

8. Generic Load/Save Functions
	8.1 Example
		val df = sqlContext.read.load("/root/labs/datasets/labs/users.parquet") 
		df.select("name", "favorite_color").write.save("namesAndFavColors.parquet")
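		// The saved output can be read back with the same generic load; parquet is the default format (sketch)
		val dfSaved = sqlContext.read.load("namesAndFavColors.parquet")
		dfSaved.show()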

9. Manually Specifying Options
	9.1 Example
		val df = sqlContext.read.format("json").load("/root/labs/datasets/labs/people.json") 
		df.select("name", "age").write.format("parquet").save("namesAndAges.parquet")
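		// write also accepts a save mode; a sketch that overwrites any previous output
		df.select("name", "age").write.mode("overwrite").format("parquet").save("namesAndAges.parquet")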

10. Run SQL on files directly
	10.1 Example
		val df = sqlContext.sql("SELECT * FROM parquet.`/root/labs/datasets/labs/users.parquet`")
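		// Display the query result
		df.show()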

11. Example of Working with a JSON Dataset (from an RDD of JSON strings)
	11.1 Example
		val rdd = sc.parallelize("""{"name":"Bin","address":{"city":"Toronto","state":"ON"}}""" :: Nil)
		val df = sqlContext.read.json(rdd)
		val df1 = df.select("name")
		df1.show()
		df1.write.format("parquet").save("/user/root/users.parquet")
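		// Nested JSON fields can be selected with dot notation (sketch)
		df.select("name", "address.city").show()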
	
12. Example of Working with a JSON Dataset (from a JSON file)
	12.1 Example
		val path = "/root/labs/datasets/labs/people.json"
		val people = sqlContext.read.json(path)
		people.printSchema()
		people.registerTempTable("people")
		val teenagers = sqlContext.sql("SELECT * FROM people WHERE age >= 13 AND age <= 19")
		teenagers.write.format("parquet").save("/user/root/people.parquet")
		
13. Example of Working with a Parquet File
	13.1 Example
		val parquetFile = sqlContext.read.parquet("/user/root/people.parquet")
		parquetFile.registerTempTable("parquetFile")
		val teenagers = sqlContext.sql("SELECT * FROM parquetFile")
		teenagers.map(t => "Name: " + t(0)).collect().foreach(println)
		
14. JDBC to RDBMS Sources
	14.1 Exit the current spark-shell by typing sc.stop(), then exit
	14.2 Enter the following commands:
		spark-shell --jars /usr/hdp/current/hive/lib/mysql-connector-java.jar
		val dataframe_mysql = sqlContext.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/hive").option("driver", "com.mysql.jdbc.Driver").option("dbtable", "ROLES").option("user", "root").option("password", "").load()
		dataframe_mysql.show()

15. Building SparkSession using Fluent API
		
	import org.apache.spark.sql.SparkSession
	val spark: SparkSession = SparkSession.builder.appName("My Spark Application").master("local[*]").enableHiveSupport().getOrCreate	
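
	// Quick sanity check of the new session (sketch)
	spark.version
	spark.range(3).show()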
	
16. Dataset - Strongly-Typed Structured Query with Encoder

	val dataset = spark.range(5)
	
	// Variant 1: filter operator accepts a Scala function
	dataset.filter(n => n % 2 == 0).count

	// Variant 2: filter operator accepts a Column-based SQL expression
	dataset.filter('id % 2 === 0).count

	// Variant 3: filter operator accepts a SQL query
	dataset.filter("id % 2 = 0").count
	
	spark.range(1).filter('id === 0).explain(true)
	spark.range(1).filter(_ == 0).explain(true)
	
	// Get RDD from dataset
	val dataset = spark.range(5).withColumn("group", 'id % 2)
	dataset.rdd.toDebugString

17. Dataset and DataFrame Conversion

	val ds = Seq("I am a shiny Dataset!").toDS
	val df = Seq("I am an old grumpy DataFrame!").toDF
	val df = Seq("I am an old grumpy DataFrame!").toDF("text")
	val ds = sc.parallelize(Seq("hello")).toDS
	
	case class Token(name: String, productId: Int, score: Double)

	val data = Seq(
		Token("aaa", 100, 0.12),
		Token("aaa", 200, 0.29),
		Token("bbb", 200, 0.53),
		Token("bbb", 300, 0.42))

	// Transform data to a Dataset[Token]
	val ds = data.toDS
	
	// Transform data into a DataFrame with no explicit schema
	val df = data.toDF
	
	// Transform DataFrame into a Dataset
	val ds = df.as[Token]

	ds.show
	ds.printSchema

	// Work with Row instances in DataFrames
	df.map(_.getClass.getName).show(false)

	// Work with Token instances in the Dataset
	ds.map(_.getClass.getName).show(false)
	
	
18. Encoders — Internal Row Converters

	import org.apache.spark.sql.Encoders
	import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
	import org.apache.spark.sql.catalyst.dsl.expressions._

	case class Person(id: Long, name: String)
	
	val personEncoder = Encoders.product[Person]
	personEncoder.schema
	
	personEncoder.clsTag

	val personExprEncoder = personEncoder.asInstanceOf[ExpressionEncoder[Person]]
	
	personExprEncoder.flat

	// The Serializer part of the encoder
	personExprEncoder.serializer
	
	// The Deserializer part of the encoder
	personExprEncoder.deserializer

	personExprEncoder.namedExpressions

	// A record in a Dataset[Person]
	// A mere instance of Person case class
	// There could be thousands of Person records in a large dataset
	val jacek = Person(0, "Jacek")
	
	// Serialize a record to the internal representation, i.e. InternalRow
	val row = personExprEncoder.toRow(jacek)
	
	// Spark uses InternalRows internally for IO
	// Deserialize it to a JVM object, i.e. a Scala object
	val attrs = Seq(DslSymbol('id).long, DslSymbol('name).string)

	val jacekReborn = personExprEncoder.resolveAndBind(attrs).fromRow(row)

	jacek == jacekReborn

	
19. ExpressionEncoder — Expression-Based Encoder

	import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
	import org.apache.spark.sql.catalyst.expressions.UnsafeRow

	val stringEncoder = ExpressionEncoder[String]
	val row = stringEncoder.toRow("hello world")

	val unsafeRow = row match { case ur: UnsafeRow => ur }
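
	// Round-trip back to a JVM String (a sketch; resolveAndBind with its default attributes)
	val hello = stringEncoder.resolveAndBind().fromRow(unsafeRow)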
	
	
20. Basic Aggregation — Typed and Untyped Grouping Operators

	Exit the spark-shell and start a new one
	
	spark.range(10).agg(sum('id) as "sum").show

	val ints = 1 to math.pow(10, 3).toInt
	val nms = ints.toDF("n").withColumn("m", 'n % 2)
	nms.count
	
	val q = nms.groupBy('m).agg(sum('n) as "sum").orderBy('m)
	
	q.show

	case class Token(name: String, productId: Int, score: Double)
	val data = Seq(
		Token("aaa", 100, 0.12),
		Token("aaa", 200, 0.29),
		Token("bbb", 200, 0.53),
		Token("bbb", 300, 0.42))
	
	val tokens = data.toDS.cache
	
	tokens.show

	tokens.groupBy('name).avg().show
	
	tokens.groupBy('name, 'productId).agg(Map("score" -> "avg")).show

	tokens.groupBy('name).count.show

	tokens.groupBy('name).max("score").show
	
	tokens.groupBy('name).sum("score").show

	tokens.groupBy('productId).sum("score").show

	tokens.groupByKey(_.productId).count.orderBy($"value").show
	
	import org.apache.spark.sql.expressions.scalalang._

	val q = tokens.groupByKey(_.productId).agg(typed.sum[Token](_.score)).toDF("productId", "sum").orderBy('productId)
	
	q.show
	
	// groupBy operator
	val countsAndSums = spark.range(10).withColumn("group", 'id % 2).groupBy("group").agg(count("id") as "count", sum("id") as "sum")
	countsAndSums.show
	println(countsAndSums.queryExecution.logical.numberedTreeString)

	// rollup operator
	val rollupQ = spark.range(2).rollup('id).agg(count('id))
	println(rollupQ.queryExecution.logical.numberedTreeString)
	
	// cube operator
	val cubeQ = spark.range(2).cube('id).agg(count('id))
	println(cubeQ.queryExecution.logical.numberedTreeString)

	// pivot operator
	val pivotQ = spark.range(10).withColumn("group", 'id % 2).groupBy("group").pivot("group").agg(count("id"))
	println(pivotQ.queryExecution.logical.numberedTreeString)
	
	val visits = Seq(
		(0, "Warsaw", 2015),
		(1, "Warsaw", 2016),
		(2, "Boston", 2017)).toDF("id", "city", "year")
		
	val q = visits.groupBy("city").pivot("year").count()
	q.show
	q.explain
	q.queryExecution.logical

	visits.groupBy('city).pivot("year", Seq("2015")).count.show

	
21. Join

	val left = Seq((0, "zero"), (1, "one")).toDF("id", "left")
	val right = Seq((0, "zero"), (2, "two"), (3, "three")).toDF("id", "right")
	
	// Inner join
	left.join(right, "id").show

	// Full outer
	left.join(right, Seq("id"), "fullouter").show
	left.join(right, Seq("id"), "fullouter").explain
	
	// Left anti
	left.join(right, Seq("id"), "leftanti").show
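
	// Left outer join follows the same pattern (additional sketch)
	left.join(right, Seq("id"), "leftouter").show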
	
	// Inner equi-join
	case class Person(id: Long, name: String, cityId: Long)
	case class City(id: Long, name: String)

	val family = Seq(
		Person(0, "Agata", 0),
		Person(1, "Iweta", 0),
		Person(2, "Patryk", 2),
		Person(3, "Maksym", 0)).toDS

	val cities = Seq(
		City(0, "Warsaw"),
		City(1, "Washington"),
		City(2, "Sopot")).toDS
		
	val joined = family.joinWith(cities, family("cityId") === cities("id"))
	joined.printSchema
	joined.show

	
22. Broadcast Joins

	// auto broadcast joins	
	val threshold =  spark.conf.get("spark.sql.autoBroadcastJoinThreshold").toInt
	val q = spark.range(100).as("a").join(spark.range(100).as("b")).where($"a.id" === $"b.id")
	println(q.queryExecution.logical.numberedTreeString)
	println(q.queryExecution.sparkPlan.numberedTreeString)
	q.explain

	// disable broadcast joins	
	spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
	q.explain

	// Force BroadcastHashJoin with broadcast hint (as function)
	val qBroadcast = spark.range(100).as("a").join(broadcast(spark.range(100)).as("b")).where($"a.id" === $"b.id")
	qBroadcast.explain

	// Force BroadcastHashJoin using SQL's BROADCAST hint
	// Supported hints: BROADCAST, BROADCASTJOIN or MAPJOIN
	val qBroadcastLeft = """
		SELECT /*+ BROADCAST (lfj) */ *
		FROM (select * from range(100)) lfj, (select * from range(1000)) rtj
		WHERE lfj.id = rtj.id
	"""
	sql(qBroadcastLeft).explain
	
	val qBroadcastRight = """
		SELECT /*+ MAPJOIN (rt) */ *
		FROM (select * from range(100)) lf, (select * from range(1000)) rt
		WHERE lf.id = rt.id
	"""
	sql(qBroadcastRight).explain
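
	// Restore auto broadcast joins afterwards (sketch, reusing the threshold value captured above)
	spark.conf.set("spark.sql.autoBroadcastJoinThreshold", threshold)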

	
23. Multi-Dimensional Aggregation	

	val sales = Seq(
		("Warsaw", 2016, 100),
		("Warsaw", 2017, 200),
		("Boston", 2015, 50),
		("Boston", 2016, 150),
		("Toronto", 2017, 50)).toDF("city", "year", "amount")
		
	// Very labor-intensive approach:
	// separate groupBy queries unioned together
	val groupByCityAndYear = sales.groupBy("city", "year").agg(sum("amount") as "amount")
	val groupByCityOnly = sales.groupBy("city").agg(sum("amount") as "amount").select($"city", lit(null) as "year", $"amount")
	val withUnion = groupByCityAndYear.union(groupByCityOnly).sort($"city".desc_nulls_last, $"year".asc_nulls_last)
	withUnion.show

	// Roll up your sleeves!
	val withRollup = sales.rollup("city", "year").agg(sum("amount") as "amount", grouping_id() as "gid").sort($"city".desc_nulls_last, $"year".asc_nulls_last).filter(grouping_id() =!= 3).select("city", "year", "amount")
	withRollup.show

	sales.createOrReplaceTempView("sales")
	val withGroupingSets = sql("""
		SELECT city, year, SUM(amount) as amount
		FROM sales
		GROUP BY city, year
		GROUPING SETS ((city, year), (city))
		ORDER BY city DESC NULLS LAST, year ASC NULLS LAST
		""")
	withGroupingSets.show
	
	import java.time.LocalDate
	import java.sql.Date
	import java.time.Month
	val expenses = Seq(
		((2012, Month.DECEMBER, 12), 5),
		((2016, Month.AUGUST, 13), 10),
		((2017, Month.MAY, 27), 15))
	.map { case ((yy, mm, dd), a) => (LocalDate.of(yy, mm, dd), a) }
	.map { case (d, a) => (d.toString, a) }
	.map { case (d, a) => (Date.valueOf(d), a) }
	.toDF("date", "amount")
	
	expenses.show
	
	// rollup time!
	val q = expenses.rollup(year($"date") as "year", month($"date") as "month").agg(sum("amount") as "amount").sort($"year".asc_nulls_last, $"month".asc_nulls_last)
	q.show
	
	val sales = Seq(
		("Warsaw", 2016, 100),
		("Warsaw", 2017, 200),
		("Boston", 2015, 50),
		("Boston", 2016, 150),
		("Toronto", 2017, 50)
	).toDF("city", "year", "amount")

	val q = sales.rollup("city", "year").agg(sum("amount") as "amount").sort($"city".desc_nulls_last, $"year".asc_nulls_last)
	q.show

	// The above query is semantically equivalent to the following
	val q1 = sales.groupBy("city", "year").agg(sum("amount") as "amount")
	val q2 = sales.groupBy("city").agg(sum("amount") as "amount").select($"city", lit(null) as "year", $"amount")
	val q3 = sales.groupBy().agg(sum("amount") as "amount").select(lit(null) as "city", lit(null) as "year", $"amount")
	val qq = q1.union(q2).union(q3).sort($"city".desc_nulls_last, $"year".asc_nulls_last)
	qq.show
	
	val inventory = Seq(
		("table", "blue", 124),
		("table", "red", 223),
		("chair", "blue", 101),
		("chair", "red", 210)).toDF("item", "color", "quantity")
		
	inventory.rollup("item", "color").sum().show
	
	val quarterlyScores = Seq(
		("winter2014", "Agata", 99),
		("winter2014", "Jacek", 97),
		("summer2015", "Agata", 100),
		("summer2015", "Jacek", 63),
		("winter2015", "Agata", 97),
		("winter2015", "Jacek", 55),
		("summer2016", "Agata", 98),
		("summer2016", "Jacek", 97)).toDF("period", "student", "score")
		
	quarterlyScores.rollup("period", "student").sum("score").show
	
	// using struct function
	inventory.rollup(struct("item", "color") as "(item,color)").sum().show
	
	// using expr function
	inventory.rollup(expr("(item, color)") as "(item, color)").sum().show
	
	val sales = Seq(
		("Warsaw", 2016, 100),
		("Warsaw", 2017, 200),
		("Boston", 2015, 50),
		("Boston", 2016, 150),
		("Toronto", 2017, 50)
	).toDF("city", "year", "amount")

	val q = sales.cube("city", "year").agg(sum("amount") as "amount").sort($"city".desc_nulls_last, $"year".asc_nulls_last)
	q.show

	val sales = Seq(
		("Warsaw", 2016, 100),
		("Warsaw", 2017, 200),
		("Boston", 2015, 50),
		("Boston", 2016, 150),
		("Toronto", 2017, 50)
	).toDF("city", "year", "amount")
	sales.createOrReplaceTempView("sales")
	
	// equivalent to rollup("city", "year")
	val q = sql("""
		SELECT city, year, sum(amount) as amount
		FROM sales
		GROUP BY city, year
		GROUPING SETS ((city, year), (city), ())
		ORDER BY city DESC NULLS LAST, year ASC NULLS LAST
	""")
	q.show

	
24. Dataset Caching and Persistence

	val df = spark.range(1).cache
	df.show
	
	// InMemoryRelation is used for cached queries
	df.queryExecution.withCachedData

	// Notice InMemoryRelation in use for cached queries
	df.withColumn("newId", 'id).explain(extended = true)
	
	// Clear in-memory cache using SQL
	// Equivalent to spark.catalog.clearCache
	sql("CLEAR CACHE").collect
	
	// use SQL’s CACHE TABLE [tableName] to cache tableName table in memory. 
	// CACHE TABLE is an eager operation which is executed as soon as the statement is executed.
	sql("CACHE TABLE [tableName]")

	// use LAZY keyword to make caching lazy.
	sql("CACHE LAZY TABLE [tableName]")

	// use SQL’s REFRESH TABLE [tableName] to refresh a cached table.

	// use SQL’s UNCACHE TABLE (IF EXISTS)? [tableName] to remove a table from the cache.

	// use SQL’s CLEAR CACHE to remove all tables from the cache.
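
	// For a hypothetical table named myTable, the statements above would look like:
	// sql("REFRESH TABLE myTable")
	// sql("UNCACHE TABLE IF EXISTS myTable")
	// sql("CLEAR CACHE")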

	val q1 = spark.range(5).cache.filter($"id" % 2 === 0).select("id")
	val q2 = spark.range(1).filter($"id" % 2 === 0).select("id").cache
	
	// check whether a Dataset was cached or not using the following code:
	
	:type q2
	
	val cache = spark.sharedState.cacheManager
	cache.lookupCachedData(q2.queryExecution.logical).isDefined
	
	// register Dataset as temporary view (table)
	spark.range(1).createOrReplaceTempView("one")

	// caching is lazy and won't happen until an action is executed
	val one = spark.table("one").cache
	
	spark.catalog.isCached("one")

	spark.range(100).createOrReplaceTempView("hundred")

	// SQL's CACHE TABLE is eager
	// The following gives "In-memory table `hundred`"
	spark.sql("CACHE TABLE hundred")
	
	// register Dataset under name
	val ds = spark.range(20)
	spark.sharedState.cacheManager.cacheQuery(ds, Some("twenty"))

	// trigger an action
	ds.head
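
	// Remove the cached Dataset again (sketch)
	ds.unpersist()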
	
	
25. Standard Functions — functions Object

	import org.apache.spark.sql.functions._
	
	// There are over 200 functions in the functions object
	spark.catalog.listFunctions.count

	val dataset = spark.range(9).withColumn("bucket", 'id % 3)
	
	import org.apache.spark.sql.expressions.Window
	
	val byBucket = Window.partitionBy('bucket).orderBy('id)
	
	dataset.withColumn("rank", rank over byBucket).show
	dataset.withColumn("percent_rank", percent_rank over byBucket).show
	dataset.union(dataset).withColumn("rank", rank over byBucket).show
	dataset.union(dataset).withColumn("dense_rank", dense_rank over byBucket).show
	dataset.union(dataset).withColumn("percent_rank", percent_rank over byBucket).show
	
	val buckets = spark.range(9).withColumn("bucket", 'id % 3)
	val dataset = buckets.union(buckets)
	val windowSpec = Window.partitionBy('bucket).orderBy('id)
	dataset.withColumn("cume_dist", cume_dist over windowSpec).show
	
	val buckets = spark.range(9).withColumn("bucket", 'id % 3)
	val dataset = buckets.union(buckets)
	val windowSpec = Window.partitionBy('bucket).orderBy('id)
	dataset.withColumn("lag", lag('id, 1) over windowSpec).show

	dataset.withColumn("lag", lag('id, 2, "<default_value>") over windowSpec).show
	
	dataset.withColumn("lead", lead('id, 1) over windowSpec).show

	dataset.withColumn("lead", lead('id, 2, "<default_value>") over windowSpec).show
	
	dataset.withColumn("row_number", row_number() over windowSpec).show

	val dataset = spark.range(7).select('*, 'id % 3 as "bucket")
	val byBuckets = Window.partitionBy('bucket).orderBy('id)
	dataset.select('*, ntile(3) over byBuckets as "ntile").show
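
	// Aggregate functions also work over a window, e.g. a per-bucket running total (sketch)
	dataset.select('*, sum('id) over byBuckets as "running_sum").show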

	
26. UDFs — User-Defined Functions	
	
	val dataset = Seq((0, "hello"), (1, "world")).toDF("id", "text")
	
	// Define a regular Scala function
	val upper: String => String = _.toUpperCase

	// Define a UDF that wraps the upper Scala function defined above
	// You could also define the function in place, i.e. inside udf
	// but separating Scala functions from Spark SQL's UDFs allows for easier testing
	import org.apache.spark.sql.functions.udf
	val upperUDF = udf(upper)

	// Apply the UDF to change the source dataset
	dataset.withColumn("upper", upperUDF('text)).show

	spark.udf.register("myUpper", (input: String) => input.toUpperCase)
	
	spark.catalog.listFunctions.filter('name like "%upper%").show(false)
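
	// The registered UDF is also usable from SQL; the view name "phrases" is an assumption (sketch)
	dataset.createOrReplaceTempView("phrases")
	spark.sql("SELECT id, myUpper(text) AS upper FROM phrases").show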


27. Cost-Based Optimization (CBO) of Logical Query Plan

	val sqlConf = spark.sessionState.conf
	println(sqlConf.cboEnabled)

	val sparkCboEnabled = spark.newSession
	import org.apache.spark.sql.internal.SQLConf.CBO_ENABLED
	sparkCboEnabled.conf.set(CBO_ENABLED.key, true)
	val isCboEnabled = sparkCboEnabled.conf.get(CBO_ENABLED.key)
	println(s"Is CBO enabled? $isCboEnabled")
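
	// CBO draws its estimates from table statistics; for a hypothetical table t1 they could be computed as:
	// sparkCboEnabled.sql("ANALYZE TABLE t1 COMPUTE STATISTICS")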
	